library(here)
source(here("code", "plotting-functions.R"))
# Sample metadata. The `group` column is renamed to `sample` so that it shares
# a key name with the OTU count table below.
metadat <- readxl::read_excel(here("data", "raw", "ml_metadata.xlsx")) %>%
    rename(sample = group)

# Absolute OTU counts, one row per sample, with `pos_cdiff_d1` attached from
# the metadata. `right_join()` keeps every metadata sample, so a sample with no
# row in the count table ends up with NA counts.
# The join key is given explicitly so it cannot silently change if the two
# tables ever share another column name.
abs_abun_dat <- data.table::fread(here("data", "raw", "sample.final.shared")) %>%
    rename(sample = Group) %>%
    right_join(
        metadat %>% select(sample, pos_cdiff_d1),
        by = "sample"
    )

# Per-sample total across all OTU columns; used as the denominator for
# relative abundance. NA counts propagate (no `na.rm`).
abs_abun_dat$total_counts <- abs_abun_dat %>%
    select(starts_with("Otu")) %>%
    rowSums()
# Long-format relative abundances: one row per sample/OTU pair, where
# rel_abun = OTU count / total counts for that sample. `pos_cdiff_d1` is passed
# through `capwords()` before being kept.
rel_abun_dat <- pivot_longer(
    abs_abun_dat,
    cols = starts_with("Otu"),
    names_to = "otu",
    values_to = "count"
) %>%
    mutate(
        rel_abun = count / total_counts,
        pos_cdiff_d1 = capwords(pos_cdiff_d1)
    ) %>%
    select(sample, pos_cdiff_d1, otu, rel_abun)
# Smallest strictly positive relative abundance in the data (a single number).
# It is the basis for the small pseudo-count added before log-transforming.
# Yields NA if no value is greater than zero.
smallest_non_zero <- rel_abun_dat %>%
    filter(rel_abun > 0) %>%
    pull(rel_abun) %>%
    sort() %>%
    first()

# Feature-importance results and OTU taxonomy, read from the project tree.
feat_dat <- here("results", "feature-importance_results.csv") %>%
    read_csv()
tax_dat <- here("data", "processed", "final.taxonomy.tsv") %>%
    schtools::read_tax()

# Relative abundances restricted to the features returned by `get_top_feats()`
# (called with alpha_level = 0.05), plus two shifted copies of `rel_abun`:
#   rel_abun_c = rel_abun + (smallest non-zero relative abundance) / 10
#   rel_abun_1 = rel_abun + 1
top_feats_rel_abun <- feat_dat %>%
    get_top_feats(tax_dat, alpha_level = 0.05) %>%
    select(otu, label) %>%
    left_join(rel_abun_dat, by = "otu") %>%
    mutate(
        rel_abun_c = rel_abun + smallest_non_zero / 10,
        rel_abun_1 = rel_abun + 1
    )

Original relative abundance, no transformation

# Raw relative abundance on the x axis; no scale transformation added.
plot_rel_abun(top_feats_rel_abun, xcol = rel_abun)

Original relative abundance, log\(_{10}\)-transformed

# Raw relative abundance with a log10 x scale added on top of the plot.
plot_rel_abun(top_feats_rel_abun, xcol = rel_abun) +
    scale_x_log10()

The log scale drops 363 rows because log10(0) = -Inf, so we need to add a constant to the relative abundances before log-transforming.

Relative abundance + 1, no transformation

# Relative abundance shifted by +1; no scale transformation added.
plot_rel_abun(top_feats_rel_abun, xcol = rel_abun_1)

Relative abundance + 1, log\(_{10}\)-transformed

# Relative abundance shifted by +1, with a log10 x scale added to the plot.
plot_rel_abun(top_feats_rel_abun, xcol = rel_abun_1) +
    scale_x_log10()

…that looks the same as above, just +1. Is there a problem in the scaling function? What if I manually log-10 transform it?

# Same +1 column, but log10 is applied to the values themselves instead of
# through an axis scale, to check whether scale_x_log10() is the culprit.
plot_rel_abun(top_feats_rel_abun, xcol = log10(rel_abun_1))

The x-axis numbers are different, but the scale is still linear! So maybe adding 1 is the problem?

Relative abundance + tiny constant, no transformation

# Relative abundance shifted by the tiny constant (smallest non-zero value
# divided by 10); no scale transformation added.
plot_rel_abun(top_feats_rel_abun, xcol = rel_abun_c)

Relative abundance + tiny constant, log\(_{10}\)-transformed

# Relative abundance shifted by the tiny constant, with a log10 x scale added.
plot_rel_abun(top_feats_rel_abun, xcol = rel_abun_c) +
    scale_x_log10()

What does log\(_{10}\)-transforming do to a fake dataset?

# Synthetic example: relative abundances on an even grid from 0 to 1 (step
# 0.01), with three log10 variants computed next to the raw value.
#   rel_abun_log10   = log10(rel_abun)            (-Inf at rel_abun == 0)
#   rel_abun_1_log10 = log10(rel_abun + 1)
#   rel_abun_c_log10 = log10(rel_abun + constant)
# `rank` is the row index, i.e. the position of each value on the grid.
constant <- 1e-04
dat <- tibble(rel_abun = seq.int(0, 1, by = 0.01)) %>%
    mutate(
        rel_abun_log10 = log10(rel_abun),
        rel_abun_1_log10 = log10(rel_abun + 1),
        rel_abun_c_log10 = log10(rel_abun + constant),
        rank = row_number()
    )
set1colors <- RColorBrewer::brewer.pal(9, "Set1")

# One line per scale: transformed value (x) against rank (y). Column names are
# swapped for readable legend labels first; the raw `rel_abun` name is kept.
# NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for lines;
# left as `size` here because the installed ggplot2 version is not known.
dat %>%
    pivot_longer(
        cols = starts_with("rel_abun"),
        names_to = "rel_abun_scale",
        values_to = "rel_abun_value"
    ) %>%
    mutate(
        rel_abun_scale = case_when(
            rel_abun_scale == "rel_abun_1_log10" ~ "log10(rel_abun + 1)",
            rel_abun_scale == "rel_abun_c_log10" ~ "log10(rel_abun + C)",
            rel_abun_scale == "rel_abun_log10" ~ "log10(rel_abun)",
            TRUE ~ rel_abun_scale
        )
    ) %>%
    ggplot(aes(x = rel_abun_value, y = rank, color = rel_abun_scale)) +
    geom_line(aes(size = rel_abun_scale), alpha = 0.5) +
    scale_size_manual(
        values = c(
            "rel_abun" = 1.5,
            "log10(rel_abun)" = 1.5,
            "log10(rel_abun + C)" = 2.5,
            "log10(rel_abun + 1)" = 1.5
        )
    ) +
    scale_color_manual(
        values = c(
            "rel_abun" = set1colors[9],
            "log10(rel_abun)" = set1colors[5],
            "log10(rel_abun + C)" = set1colors[2],
            "log10(rel_abun + 1)" = set1colors[1]
        )
    ) +
    theme_bw()